May 16, 2016
ggplot(Data)geom_line(), geom_bar(), geom_point()ggplot(Data,aes(x=1, y=1, color='black'))scale_fill_continuous(), scale_fill_grey()stat_density2d()dist(), hclust()kmeans()距離還有聚合方式kmeans(), k=2kmeans(), k=3kmeans(), k=4按Raw,右鍵另存新檔
投影片下載:
按右鍵,另存新檔
SSE: The sum of the squared distance between each member of a cluster and its cluster centroid.
withinssdataMatrix <- as.matrix(dataFrame)[sample(1:12),]
# Elbow plot: within-group sum of squares (SSE) for k = 1 .. nrow-1 clusters
k_max <- nrow(dataMatrix) - 1
# k = 1 case: total variance of the data around the grand centroid
wss <- (nrow(dataMatrix) - 1) * sum(apply(dataMatrix, 2, var))
for (k in 2:k_max) {
  wss[k] <- sum(kmeans(dataMatrix, centers = k)$withinss)
}
par(mfrow = c(1, 1), mar = c(4, 4, 1, 1))  # margins: bottom, left, top, right
plot(seq_len(k_max), wss, type = "b",
     xlab = "Number of Clusters",
     ylab = "Within groups sum of squares")
# Build a numeric matrix from mtcars, then blank out 10 random cells so the
# knn-imputation demo below has missing values to fill in.
# (The original transcript collapsed this chunk onto one line, which put the
# NA-insertion and head() calls inside the "##" comment — they never ran.)
dataMatrix2 <- as.matrix(mtcars)
## Randomly insert some missing data
dataMatrix2[sample(1:100, size = 10, replace = FALSE)] <- NA
head(dataMatrix2, 10)
## mpg cyl disp hp drat wt qsec vs am gear carb ## Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4 ## Mazda RX4 Wag 21.0 NA 160.0 110 3.90 2.875 17.02 0 1 4 4 ## Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1 ## Hornet 4 Drive 21.4 6 258.0 NA 3.08 3.215 19.44 1 0 3 1 ## Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2 ## Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1 ## Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4 ## Merc 240D 24.4 NA NA 62 3.69 3.190 20.00 1 0 4 2 ## Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2 ## Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
用knn的方法計算空值可能可以帶入的數值
# One-off Bioconductor install of the "impute" package (uncomment to run):
#source("https://bioconductor.org/biocLite.R")
#biocLite("impute")
library(impute)
# impute.knn() replaces each NA with a k-nearest-neighbour estimate;
# the completed matrix is returned in the $data component.
dataMatrix2 <- impute.knn(dataMatrix2)$data
head(dataMatrix2,10)
## mpg cyl disp hp drat wt qsec vs am gear ## Mazda RX4 21.0 6.000000 160.00 110 3.90 2.620 16.46 0 1 4 ## Mazda RX4 Wag 21.0 5.555556 160.00 110 3.90 2.875 17.02 0 1 4 ## Datsun 710 22.8 4.000000 108.00 93 3.85 2.320 18.61 1 1 4 ## Hornet 4 Drive 21.4 6.000000 258.00 164 3.08 3.215 19.44 1 0 3 ## Hornet Sportabout 18.7 8.000000 360.00 175 3.15 3.440 17.02 0 0 3 ## Valiant 18.1 6.000000 225.00 105 2.76 3.460 20.22 1 0 3 ## Duster 360 14.3 8.000000 360.00 245 3.21 3.570 15.84 0 0 3 ## Merc 240D 24.4 4.400000 127.67 62 3.69 3.190 20.00 1 0 4 ## Merc 230 22.8 4.000000 140.80 95 3.92 3.150 22.90 1 0 4 ## Merc 280 19.2 6.000000 167.60 123 3.92 3.440 18.30 1 0 4 ## carb ## Mazda RX4 4 ## Mazda RX4 Wag 4 ## Datsun 710 1 ## Hornet 4 Drive 1 ## Hornet Sportabout 2 ## Valiant 1 ## Duster 360 4 ## Merc 240D 2 ## Merc 230 2 ## Merc 280 4
Machine learning from data 輸入資料學習新資訊,變成智慧的演算法演算法可以基於輸入資料,預測事件或協助決策
資料太少?太髒?–>學不好
Data --- Machine Learning ---> Skill
Skill: 變準/變好/賺更多…etc
Intelligent behavior: ML is one possible and popular route to realize AI
學學這些模式/模型了解兩個或多個變數間是否相關、相關方向與強度,並建立數學模型以便觀察特定變數來預測研究者感興趣的變數
Linear Regression 線性迴歸#讀入SportsAnalytics package
# Load SportsAnalytics, installing it on first use (require() returns FALSE
# when the package is missing, so the branch installs and then attaches it).
if (!require('SportsAnalytics')){
install.packages("SportsAnalytics")
library(SportsAnalytics)
}
# Fetch per-player statistics for the 2015-16 NBA season
NBA1516<-fetch_NBAPlayerStatistics("15-16")
# Linear regression of total points on minutes played:
# scatter plot with a GLM-fitted trend line.
# (Original transcript glued the section heading onto library(ggplot2),
# which is not valid R — de-collapsed here.)
library(ggplot2)
ggplot(NBA1516, aes(x = TotalMinutesPlayed, y = TotalPoints)) +
  geom_point() + geom_smooth(method = "glm")
# glm(formula, data) — formula is Y ~ X1 + X2 + ... + Xn
# (Y: dependent variable, X: independent variables).
# The original line also contained a bare `glm()` call with no arguments,
# an extraction artifact that would error at runtime — removed.
glm(TotalPoints ~ TotalMinutesPlayed, data = NBA1516)
## ## Call: glm(formula = TotalPoints ~ TotalMinutesPlayed, data = NBA1516) ## ## Coefficients: ## (Intercept) TotalMinutesPlayed ## -85.9071 0.4931 ## ## Degrees of Freedom: 475 Total (i.e. Null); 474 Residual ## Null Deviance: 99360000 ## Residual Deviance: 16720000 AIC: 6339
TotalPoints = 0.4931 * TotalMinutesPlayed -85.9071
glm()是廣義線性迴歸模型;family="gaussian":線性模型(等同lm());family="binomial":邏輯迴歸模型;family="poisson":卜瓦松迴歸模型。高斯函數是常態分布的密度函數
二項分布是n個獨立的是/非試驗中成功的次數的離散機率分布
次數分佈
得分與上場分鐘數和兩分球出手數的關係 - 多變量線性迴歸分析# e+01: 10^1 / e-04: 10^(-4)
# Multivariate model: total points on minutes played AND field-goal attempts
glm(TotalPoints ~ TotalMinutesPlayed + FieldGoalsAttempted,
    data = NBA1516)
## ## Call: glm(formula = TotalPoints ~ TotalMinutesPlayed + FieldGoalsAttempted, ## data = NBA1516) ## ## Coefficients: ## (Intercept) TotalMinutesPlayed FieldGoalsAttempted ## -1.799e+01 -2.347e-04 1.256e+00 ## ## Degrees of Freedom: 475 Total (i.e. Null); 473 Residual ## Null Deviance: 99360000 ## Residual Deviance: 2160000 AIC: 5367
TotalPoints = -0.0002347 * TotalMinutesPlayed + 1.255794 * FieldGoalsAttempted - 17.99
# Add playing position as a third predictor; Position is a factor, so R
# expands it into dummy variables automatically (baseline level: "C").
# (Section heading was glued onto the code line in the transcript —
# de-collapsed here.)
glm(TotalPoints ~ TotalMinutesPlayed + FieldGoalsAttempted + Position,
    data = NBA1516)
## ## Call: glm(formula = TotalPoints ~ TotalMinutesPlayed + FieldGoalsAttempted + ## Position, data = NBA1516) ## ## Coefficients: ## (Intercept) TotalMinutesPlayed FieldGoalsAttempted ## 22.852223 -0.006537 1.275721 ## PositionPF PositionPG PositionSF ## -39.416327 -65.034646 -38.522299 ## PositionSG ## -52.175144 ## ## Degrees of Freedom: 474 Total (i.e. Null); 468 Residual ## (1 observation deleted due to missingness) ## Null Deviance: 99080000 ## Residual Deviance: 1975000 AIC: 5322
# e+01: 10^1 / e-04: 10^(-4)
TotalPoints = -0.0065 * TotalMinutesPlayed + 1.28 * FieldGoalsAttempted + 22.85 - 39.42 * PositionPF - 65.03 * PositionPG - 38.52 * PositionSF - 52.18 * PositionSG
虛擬變項類別變項請記得轉成factor,R會自動建立虛擬變項依變數為連續變數,自變數為連續變數或虛擬變數的場合class(NBA1516$Position)
## [1] "factor"
levels(NBA1516$Position)
## [1] "C" "PF" "PG" "SF" "SG"
# Fit three nested linear models and compare their AICs (lower is better).
OneVar <- glm(TotalPoints ~ TotalMinutesPlayed, data = NBA1516)
TwoVar <- glm(TotalPoints ~ TotalMinutesPlayed + FieldGoalsAttempted,
              data = NBA1516)
ThreeVar <- glm(TotalPoints ~ TotalMinutesPlayed + FieldGoalsAttempted +
                  Position, data = NBA1516)
c(OneVar$aic, TwoVar$aic, ThreeVar$aic)
## [1] 6338.913 5366.763 5321.972
# Coefficient table (estimate, SE, t value, p value) for the two-predictor
# model. The transcript collapsed two statements onto one line, which is a
# syntax error in R — split back onto separate lines.
sum2 <- summary(TwoVar)
sum2$coefficients
## Estimate Std. Error t value Pr(>|t|) ## (Intercept) -1.798855e+01 5.659758251 -3.17832538 1.578333e-03 ## TotalMinutesPlayed -2.347183e-04 0.009474631 -0.02477334 9.802462e-01 ## FieldGoalsAttempted 1.255794e+00 0.022239494 56.46682752 2.474028e-212
# Coefficient table for the three-predictor model (includes Position dummies).
# De-collapsed: two statements on one line is a syntax error in R.
sum3 <- summary(ThreeVar)
sum3$coefficients
## Estimate Std. Error t value Pr(>|t|) ## (Intercept) 22.852222668 9.014714391 2.5349913 1.156964e-02 ## TotalMinutesPlayed -0.006536874 0.009199968 -0.7105322 4.777281e-01 ## FieldGoalsAttempted 1.275721212 0.021647176 58.9324535 1.144607e-218 ## PositionPF -39.416326742 9.936541704 -3.9668053 8.425605e-05 ## PositionPG -65.034646215 10.269250388 -6.3329497 5.648565e-10 ## PositionSF -38.522298887 10.488170409 -3.6729284 2.674727e-04 ## PositionSG -52.175143670 9.985331185 -5.2251791 2.625062e-07
Logistic Regression 羅吉斯迴歸依變數為二元變數(非0即1)的場合family="binomial" 邏輯迴歸模型mydata <- read.csv("http://www.ats.ucla.edu/stat/data/binary.csv")
# GRE:某考試成績, GPA:在校平均成績, rank:學校聲望
head(mydata)
## admit gre gpa rank ## 1 0 380 3.61 3 ## 2 1 660 3.67 3 ## 3 1 800 4.00 1 ## 4 1 640 3.19 4 ## 5 0 520 2.93 4 ## 6 1 760 3.00 2
# Treat school rank as categorical so glm builds dummy variables for it.
mydata$rank <- factor(mydata$rank)
# Logistic regression: admit (0/1) on GRE score, GPA, and school rank.
mylogit <- glm(admit ~ gre + gpa + rank,
               data = mydata, family = "binomial")
# NOTE: `sum` shadows base::sum in this session; kept for transcript fidelity.
sum <- summary(mylogit)
sum$coefficients
## Estimate Std. Error z value Pr(>|z|) ## (Intercept) -3.989979073 1.139950936 -3.500132 0.0004650273 ## gre 0.002264426 0.001093998 2.069864 0.0384651284 ## gpa 0.804037549 0.331819298 2.423119 0.0153878974 ## rank2 -0.675442928 0.316489661 -2.134171 0.0328288188 ## rank3 -1.340203916 0.345306418 -3.881202 0.0001039415 ## rank4 -1.551463677 0.417831633 -3.713131 0.0002047107
在樹狀目錄中建立一系列分割,以建立模型。這些分割會表示成「節點」(Node)。每次發現輸入資料行與可預測資料行有明顯地相互關聯時,此演算法就會在模型中加入一個節點。演算法決定分岔的方式不同,視它預測連續資料行或分隔資料行而定。
# Load rpart (installing on first use) and grow a classification tree that
# predicts a player's Position from blocks, threes made, assists and steals.
if (!require('rpart')) {
  install.packages("rpart")
  library(rpart)
}
DT <- rpart(Position ~ Blocks + ThreesMade + Assists + Steals, data = NBA1516)
DT
## n=475 (1 observation deleted due to missingness) ## ## node), split, n, loss, yval, (yprob) ## * denotes terminal node ## ## 1) root 475 364 PF (0.15 0.23 0.21 0.18 0.23) ## 2) ThreesMade< 2.5 132 74 C (0.44 0.35 0.098 0.053 0.061) ## 4) Blocks>=4.5 89 37 C (0.58 0.38 0.011 0.011 0.011) * ## 5) Blocks< 4.5 43 31 PF (0.14 0.28 0.28 0.14 0.16) ## 10) Steals< 2.5 29 19 PF (0.17 0.34 0.14 0.21 0.14) * ## 11) Steals>=2.5 14 6 PG (0.071 0.14 0.57 0 0.21) * ## 3) ThreesMade>=2.5 343 242 SG (0.035 0.19 0.25 0.23 0.29) ## 6) Assists>=170.5 96 39 PG (0.031 0.052 0.59 0.15 0.18) * ## 7) Assists< 170.5 247 163 SG (0.036 0.24 0.12 0.26 0.34) ## 14) Blocks>=20.5 80 42 PF (0.062 0.48 0 0.26 0.2) ## 28) Steals< 59.5 58 21 PF (0.069 0.64 0 0.14 0.16) * ## 29) Steals>=59.5 22 9 SF (0.045 0.045 0 0.59 0.32) * ## 15) Blocks< 20.5 167 99 SG (0.024 0.13 0.17 0.26 0.41) ## 30) Assists< 81.5 110 68 SG (0.027 0.18 0.091 0.32 0.38) ## 60) Blocks>=4.5 63 39 SF (0.032 0.29 0.016 0.38 0.29) ## 120) ThreesMade< 13.5 19 9 PF (0.11 0.53 0 0.26 0.11) * ## 121) ThreesMade>=13.5 44 25 SF (0 0.18 0.023 0.43 0.36) ## 242) Blocks< 9.5 17 7 SF (0 0.18 0.059 0.59 0.18) * ## 243) Blocks>=9.5 27 14 SG (0 0.19 0 0.33 0.48) * ## 61) Blocks< 4.5 47 23 SG (0.021 0.043 0.19 0.23 0.51) * ## 31) Assists>=81.5 57 31 SG (0.018 0.035 0.33 0.16 0.46) ## 62) ThreesMade< 37 17 5 PG (0 0.12 0.71 0.059 0.12) * ## 63) ThreesMade>=37 40 16 SG (0.025 0 0.17 0.2 0.6) *
#控球後衛(PG)、得分後衛(SG)、小前鋒(SF)、大前鋒(PF)和中鋒(C)
# Draw the fitted tree with base graphics. The transcript collapsed this
# chunk onto one line, which buried plot() and text() inside the margin
# comment — de-collapsed so they actually execute.
par(mfrow = c(1, 1), mar = rep(1, 4))  # margins: bottom, left, top, right
plot(DT)                                       # tree skeleton
text(DT, use.n = FALSE, all = FALSE, cex = 1)  # split labels
#控球後衛(PG)、得分後衛(SG)、小前鋒(SF)、大前鋒(PF)和中鋒(C)
預設的plot()真的太難用,改用rpart.plot package裡面的prp()
# rpart.plot::prp() renders much more readable trees than the base plot().
if (!require('rpart.plot')) {
  install.packages("rpart.plot")
  library(rpart.plot)
}
prp(DT) # Will plot the tree
prp(DT)
節點…有機會再說吧……
布林關聯規則(Boolean association rules)頻繁項集的算法# Load the libraries
# Load the association-rule mining package and the bundled example data.
if (!require('arules')) {
  install.packages("arules")
  library(arules)
}
if (!require('datasets')) {
  install.packages("datasets")
  library(datasets)
}
data(Groceries) # Load the data set
Groceries@data@Dim # 169 distinct items across 9835 transactions
## [1] 169 9835
# Plot the 20 most frequently bought items by absolute purchase count.
itemFrequencyPlot(Groceries, topN = 20, type = "absolute")

# Get the rules: apriori() mines association rules with support >= 0.1%
# and confidence >= 80%. (The transcript contained a stray argument-less
# `apriori()` call glued to a comment — it would error at runtime and has
# been removed.)
rules <- apriori(Groceries,
                 parameter = list(supp = 0.001, conf = 0.8),
                 control = list(verbose = FALSE))
# Show the top 5 rules, but only 2 digits
options(digits = 2)
inspect(rules[1:5])
## lhs rhs support confidence lift
## 1 {liquor,red/blush wine} => {bottled beer} 0.0019 0.90 11.2
## 2 {curd,cereals} => {whole milk} 0.0010 0.91 3.6
## 3 {yogurt,cereals} => {whole milk} 0.0017 0.81 3.2
## 4 {butter,jam} => {whole milk} 0.0010 0.83 3.3
## 5 {soups,bottled beer} => {whole milk} 0.0011 0.92 3.6
Support: The fraction of which our item set occurs in our dataset. 一次交易中,包括規則內的物品的聯合機率Confidence: probability that a rule is correct for a new transaction with items on the left. 包含左邊物品A的交易也會包含右邊物品B的條件機率Lift: The ratio by which by the confidence of a rule exceeds the expected confidence. 規則的信心比期望值高多少
# lift = 1 means the items on the left and right are independent.
# Sort rules by confidence and show the strongest five.
# (Prose and two statements were fused onto one line in the transcript —
# invalid R; restored as comment + separate statements.)
rules <- sort(rules, by = "confidence", decreasing = TRUE)
inspect(rules[1:5])
## lhs rhs support confidence lift
## 1 {rice,
## sugar} => {whole milk} 0.0012 1 3.9
## 2 {canned fish,
## hygiene articles} => {whole milk} 0.0011 1 3.9
## 3 {root vegetables,
## butter,
## rice} => {whole milk} 0.0010 1 3.9
## 4 {root vegetables,
## whipped/sour cream,
## flour} => {whole milk} 0.0017 1 3.9
## 5 {butter,
## soft cheese,
## domestic eggs} => {whole milk} 0.0010 1 3.9
summary(rules)
## set of 410 rules ## ## rule length distribution (lhs + rhs):sizes ## 3 4 5 6 ## 29 229 140 12 ## ## Min. 1st Qu. Median Mean 3rd Qu. Max. ## 3.0 4.0 4.0 4.3 5.0 6.0 ## ## summary of quality measures: ## support confidence lift ## Min. :0.00102 Min. :0.80 Min. : 3.1 ## 1st Qu.:0.00102 1st Qu.:0.83 1st Qu.: 3.3 ## Median :0.00122 Median :0.85 Median : 3.6 ## Mean :0.00125 Mean :0.87 Mean : 4.0 ## 3rd Qu.:0.00132 3rd Qu.:0.91 3rd Qu.: 4.3 ## Max. :0.00315 Max. :1.00 Max. :11.2 ## ## mining info: ## data ntransactions support confidence ## Groceries 9835 0.001 0.8
買了什麼東西的人,會買牛奶呢?
# Fix "whole milk" as the consequent (RHS): which item sets lead to buying
# whole milk? Then rank the resulting rules by confidence.
rulesR <- apriori(data = Groceries,
                  parameter = list(supp = 0.001, conf = 0.08),
                  appearance = list(default = "lhs", rhs = "whole milk"),
                  control = list(verbose = FALSE))
rulesR <- sort(rulesR, decreasing = TRUE, by = "confidence")
inspect(rulesR[1:5])
## lhs rhs support confidence lift
## 1 {rice,
## sugar} => {whole milk} 0.0012 1 3.9
## 2 {canned fish,
## hygiene articles} => {whole milk} 0.0011 1 3.9
## 3 {root vegetables,
## butter,
## rice} => {whole milk} 0.0010 1 3.9
## 4 {root vegetables,
## whipped/sour cream,
## flour} => {whole milk} 0.0017 1 3.9
## 5 {butter,
## soft cheese,
## domestic eggs} => {whole milk} 0.0010 1 3.9
買了牛奶的人,會買什麼呢?
# Fix "whole milk" as the antecedent (LHS): given a milk purchase, what else
# goes into the basket? minlen = 2 excludes the trivial empty-LHS rule.
rulesL <- apriori(data = Groceries,
                  parameter = list(supp = 0.001, conf = 0.15, minlen = 2),
                  appearance = list(default = "rhs", lhs = "whole milk"),
                  control = list(verbose = FALSE))
rulesL <- sort(rulesL, decreasing = TRUE, by = "confidence")
inspect(rulesL[1:5])
## lhs rhs support confidence lift
## 6 {whole milk} => {other vegetables} 0.075 0.29 1.5
## 5 {whole milk} => {rolls/buns} 0.057 0.22 1.2
## 4 {whole milk} => {yogurt} 0.056 0.22 1.6
## 2 {whole milk} => {root vegetables} 0.049 0.19 1.8
## 1 {whole milk} => {tropical fruit} 0.042 0.17 1.6
# Visualise the mined rules as a graph (interactive; can take a while).
if (!require('arulesViz')) {
  install.packages("arulesViz")
  library(arulesViz)
}
# Mac tcltk fix -> http://planspace.org/2013/01/17/fix-r-tcltk-dependency-problem-on-mac/
plot(rules, method = "graph", interactive = TRUE, shading = NA) # slow to render